using <- function(...) {
libs <- unlist(list(...))
need <- libs <- libs[!unlist(lapply(libs, require, character.only = T))]
if(length(need) > 0){
install.packages(need, repos = "https://cloud.r-project.org")
need <- need[!unlist(lapply(need, require, character.only = T))]
if (length(need) > 0) {
if (!requireNamespace("BiocManager", quietly = T))
install.packages("BiocManager")
BiocManager::install(need)
}
}
lapply(libs, require, character.only = T)
}
using("tidyverse", "DESeq2", "CAGEfightR", "gprofiler2", "pheatmap",
"org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg38.knownGene",
"BSgenome.Hsapiens.UCSC.hg38", "ggseqlogo", "patchwork")# list of samples and bigWig file names
ds <- list.files(pattern = '.bw') %>%
tibble(f = .) %>%
separate(f, c('Name', 'sign', NA), '\\.', remove = F) %>%
mutate(sign = ifelse(sign == 'plus', 'BigWigPlus', 'BigWigMinus')) %>%
pivot_wider(names_from = "sign", values_from = "f") %>%
mutate(nm = Name) %>%
column_to_rownames("nm") %>%
DataFrame()
# rearrange
bws <- list(plus = ds$BigWigPlus,
minus = ds$BigWigMinus) %>%
lapply(function(x) {
BigWigFileList(x) %>%
`names<-`(ds$Name)
})
# genome
gn <- Seqinfo(genome = 'hg38')[c(paste0('chr', 1:22), 'chrX', 'chrY')]
# read in files
ctss <- quantifyCTSSs(plusStrand = bws$plus,
minusStrand = bws$minus,
design = ds,
genome = gn)
## Checking design...
## Checking supplied genome compatibility...
## Iterating over 1 genomic tiles in 54 samples using 10 worker(s)...
## Importing CTSSs from plus strand...
## Registered S3 method overwritten by 'pryr':
## method from
## print.bytes Rcpp
## Importing CTSSs from minus strand...
## Merging strands...
## Formatting output...
## ### CTSS summary ###
## Number of samples: 54
## Number of CTSSs: 27.73 millions
## Sparsity: 94.81 %
## Type of rowRanges: StitchedGPos
## Final object size: 1.12 GBAs shown above, there are ~28M individual CTSSs, lots of which are noise. Going forwards we’ll only consider those that have any reads in more than one sample.
ctss <- calcTPM(ctss) %>%
calcPooled(inputAssay = "TPM") %>%
calcSupport(inputAssay = "counts",
outputColumn = "support",
unexpressed = 0)
sctss <- subset(ctss, support > 1) %>% calcTPM()
tibble(x = rowRanges(ctss)$support) %>%
dplyr::count(x) %>%
ggplot(aes(x = x, y = n)) +
geom_vline(xintercept = 1.5, color = 'red') +
geom_col(aes(fill = x), show.legend = F) +
scale_y_log10(expand = expansion(c(0,.1)),
breaks = 10^(c(2,4,6)),
labels = parse(text = sprintf('10^{%d}', c(2,4,6)))) +
labs(x = "# of samples with >0 reads", y = "Frequency")To call promoters, we’ll pool the signal across all samples to improve power as previously mentioned. Additionally, we’ll only consider CTSSs with pooled TPM >0.1 and aggregate ones within 20bp.
tcs <- clusterUnidirectionally(sctss,
pooledCutoff = 0.1,
mergeDist = 20)
## Splitting by strand...
## Slice-reduce to find clusters...
## Calculating statistics...
## Preparing output...
## Tag clustering summary:
## Width Count Percent
## Total 2232638 1e+02 %
## >=1 1863030 8e+01 %
## >=10 327319 1e+01 %
## >=100 42251 2e+00 %
## >=1000 38 2e-03 %On the other hand, enhancers are characterized by weak bidirectional transcription of capped eRNAs, a feature that can be exploited for their identification. Specifically, 0.95 was chosen as the threshold on the measure of balanced bidirectionality (Bhattacharyya coefficient: ranges between 0 and 1). Given that a rather stringent threshold of balanced transcription is imposed, we use the original unfiltered set of CTSSs here. Subsequently, we require that the candidate enhancers to demonstrate significant bidirectionality in more than 1 samples.
bcs <- clusterBidirectionally(ctss, balanceThreshold = 0.95) %>%
calcBidirectionality(samples = ctss)
## Pre-filtering bidirectional candidate regions...
## Retaining for analysis: 82%
## Splitting by strand...
## Calculating windowed coverage on plus strand...
## Calculating windowed coverage on minus strand...
## Calculating balance score...
## Slice-reduce to find bidirectional clusters...
## Calculating statistics...
## Preparing output...
## # Bidirectional clustering summary:
## Number of bidirectional clusters: 814055
## Maximum balance score: 1
## Minimum balance score: 0.950000014008683
## Maximum width: 2593
## Minimum width: 401enhs <- subset(bcs, bidirectionality > 1)
tibble(x = bcs$bidirectionality) %>%
dplyr::count(x) %>%
ggplot(aes(x = x, y = n)) +
geom_vline(xintercept = 1.5, color = 'red') +
geom_col(aes(fill = x), show.legend = F) +
scale_y_log10(expand = expansion(c(0,.1)),
breaks = 10^(c(1,3,5)),
labels = parse(text = sprintf('10^{%d}', c(1,3,5)))) +
labs(x = "# samples demonstrating bidirectionality", y = "Frequency")Previous filters were at a CTSS-level, now we enforce cluster-level filtering by requiring that a cluster must have >1 sample showing >1 TPM.
tcs <- quantifyClusters(ctss,
clusters = tcs,
inputAssay = "counts") %>%
calcTPM(totalTags = "totalTags") %>%
calcSupport(inputAssay = "TPM",
outputColumn = "support",
unexpressed = 1)
enhs <- quantifyClusters(ctss,
clusters = enhs,
inputAssay = "counts") %>%
calcTPM(totalTags = "totalTags") %>%
calcSupport(inputAssay = "TPM",
outputColumn = "support",
unexpressed = 1)
tibble(x = rowRanges(tcs)$support) %>%
mutate(ttl = sprintf('Unidirectional: %d passes', sum(x > 1))) %>%
dplyr::count(ttl, x) %>%
ggplot(aes(x = x, y = n)) +
geom_vline(xintercept = 1.5, color = 'red') +
geom_col(aes(fill = x), show.legend = F) +
facet_grid(. ~ ttl, scales = "free_y") +
scale_y_log10(expand = expansion(c(0,.1)),
breaks = 10^(c(2,4,6)),
labels = parse(text = sprintf('10^{%d}', c(2,4,6)))) +
labs(x = "# of samples with >1 TPM", y = "Frequency") `
`
So let’s proceed to take those supported by at least 2 samples
tcs2 <- subset(tcs, support > 1)
enhs2 <- subset(enhs, support > 1)We can further assess the “shape” of TSSs based on the distribution of signals: i.e., is there a single dominant peak, or multiple? The interquantile range of signals is one way to assess this pattern, and we only focus on highly expressed clusters for the sake of simplicity.
hi <- tcs2[rowMeans(assay(tcs2, 'TPM')) > 10] %>%
calcShape(pooled = sctss,
outputColumn = "IQR",
shapeFunction = shapeIQR,
lower = 0.05, upper = 0.95)
rowData(hi)$shape <- ifelse(rowData(hi)$IQR < 10, 'Sharp', 'Broad')
rowData(hi)[,c('IQR', 'shape')] %>%
as_tibble() %>%
group_by(shape) %>%
mutate(kind = sprintf('%s: %d', shape, n())) %>%
ungroup() %>%
ggplot(aes(x = IQR)) +
geom_histogram(aes(fill = kind), binwidth = 1) +
geom_vline(xintercept = 9.5, color = 'red', linetype = 'dashed') +
scale_fill_manual(values = c('#3976AF', '#F08536')) +
coord_cartesian(xlim = c(0,100)) +
scale_y_continuous(expand = expansion(c(0,.1))) +
labs(x = "IQR (bp)", y = "Frequency")Are these two classes different at a sequence level? We’ll consider the most enriched peak as the putative TSS in each cluster
pmts <- rowRanges(hi) %>% swapRanges() # get the peak instead of the entire cluster`
BSgenome.Hsapiens.UCSC.hg38 instead of hg19`
tcs2 <- assignTxType(tcs2,
txModels = TxDb.Hsapiens.UCSC.hg38.knownGene,
outputColumn = "peakTxType",
swap = "thick")
## Finding hierachical overlaps with swapped ranges...
## ### Overlap summary: ###
## txType count percentage
## 1 promoter 22971 38.2
## 2 proximal 5611 9.3
## 3 fiveUTR 1257 2.1
## 4 threeUTR 3882 6.5
## 5 CDS 4945 8.2
## 6 exon 879 1.5
## 7 intron 6285 10.4
## 8 antisense 7748 12.9
## 9 intergenic 6577 10.9enhs2 <- assignTxType(enhs2,
txModels = TxDb.Hsapiens.UCSC.hg38.knownGene,
outputColumn = "peakTxType",
swap = "thick")
## Finding hierachical overlaps with swapped ranges...
## ### Overlap summary: ###
## txType count percentage
## 1 promoter 1192 40.3
## 2 proximal 862 29.1
## 3 fiveUTR 15 0.5
## 4 threeUTR 39 1.3
## 5 CDS 24 0.8
## 6 exon 42 1.4
## 7 intron 474 16.0
## 8 antisense 0 0.0
## 9 intergenic 310 10.5From the previous transcript-based annotation we can easily apply aggregation to obtain gene-level groupings
tcs2 <- assignGeneID(tcs2,
geneModels = TxDb.Hsapiens.UCSC.hg38.knownGene,
outputColumn = "geneID")
## Extracting genes...
## 1613 genes were dropped because they have exons located on both strands
## of the same reference sequence or on more than one reference sequence,
## so cannot be represented by a single genomic range.
## Use 'single.strand.genes.only=FALSE' to get all the genes in a
## GRangesList object, or use suppressMessages() to suppress this message.
## Overlapping while taking distance to nearest TSS into account...
## Finding hierachical overlaps...
## ### Overlap Summary: ###
## Features overlapping genes: 67.93 %
## Number of unique genes: 16163As a sanity check we can do differential gene expression analysis using the aggregated count matrix (a pair of K27M-KO samples vs two K27M). And indeed we find ample DEGS, with IGF2 being a known upregulation target of K27M.
symbols <- mapIds(org.Hs.eg.db,
keys = rowData(tcs2)$geneID,
keytype = "ENTREZID",
column = "SYMBOL")
rowData(tcs2)$symbol <- as.character(symbols)
genelevel <- quantifyGenes(tcs2,
genes = "geneID",
inputAssay = "counts")
m <- assay(genelevel)
cd <- tibble(samp = colnames(m)) %>%
mutate(cond = ifelse(grepl('^cont', samp), 'ctrl', 'uc')) %>%
column_to_rownames('samp')
dds <- DESeqDataSetFromMatrix(m, cd, ~cond) %>% DESeq()
res <- results(dds)
resLFC <- lfcShrink(dds, 2, type = 'apeglm')
volc <- function(r, x, y, ttl) {
d <- as.data.frame(r) %>%
dplyr::rename(x = !!x, y = !!y) %>%
mutate(kind = case_when(abs(x) > 2 & y < .01 ~ 'DE',
abs(x) > 2 ~ 'big',
y < .01 ~ 'sig',
T ~ 'NS'),
y = -log10(y))
ct <- na.omit(d) %>%
dplyr::filter(kind == 'DE') %>%
mutate(up = x > 0) %>%
dplyr::count(up) %>%
mutate(x = ifelse(up, Inf, -Inf),
y = Inf,
h = as.numeric(up))
ggplot(d, aes(x, y, color = kind)) +
geom_vline(xintercept = c(-2, 2), linetype = 'dashed') +
geom_hline(yintercept = 2, linetype = 'dashed') +
geom_point(alpha = .5) +
geom_label(aes(x = x, y = y, label = n, hjust = h),
vjust = 1, data = ct, inherit.aes = F) +
scale_y_continuous() +
scale_color_manual(values = c('forestgreen', 'red2', 'grey30', 'royalblue')) +
labs(x = x, y = y, title = ttl)
}
volc(resLFC, 'log2FoldChange', 'padj', 'Gene-level')With CAGE we’d like to look the expression from individual TSS clusters: here we’ll just consider those that were successfully annotated to genes and constitute >10% of their cognate gene’s total expression level in >1 sample.
tc <- subset(tcs2, !is.na(geneID)) %>%
calcComposition(outputColumn = "composition",
unexpressed = 0.1,
genes = "geneID")
tibble(x = rowRanges(tc)$composition) %>%
dplyr::count(x) %>%
ggplot(aes(x = x, y = n)) +
geom_vline(xintercept = 1.5, color = 'red') +
geom_col(aes(fill = x), show.legend = F) +
scale_y_log10(expand = expansion(c(0,.1)),
breaks = 10^(1:4),
labels = parse(text = sprintf('10^{%d}', 1:4))) +
labs(x = "# of samples with >10% contribution", y = "Frequency") Now we proceed like we’ve done for genes.
m <- assay(tc)
dds <- DESeqDataSetFromMatrix(m, cd, ~cond) %>% DESeq()
res2 <- results(dds)
resLFC2 <- lfcShrink(dds, 2, type = 'apeglm')
volc(resLFC2, 'log2FoldChange', 'padj', 'Cluster-level')